#R libraries

library(knitr)
library(yaml)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
require(graphics)
library(tidyverse)
## -- Attaching packages ----------------------------------------------------------------- tidyverse 1.3.0 --
## v tibble  3.0.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## v purrr   0.3.4
## -- Conflicts -------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(ggthemes)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(maps)
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map
library(stringr)
library(stringi)
library(mapproj)
library(RCurl)
## 
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
## 
##     complete
library(readr)
library(rio)
## 
## Attaching package: 'rio'
## The following object is masked from 'package:plotly':
## 
##     export
library(naniar)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
library(grid)

#Beer and Breweries data input from CSV files

# This reads in the Beers data from select folder file Beers.csv.
Beerdata <- read.csv("https://raw.githubusercontent.com/VenkataVanga/MSDS-6306/main/Beers.csv",
                     header = T,sep = ",",na.strings = "NA",fill = TRUE)
head(Beerdata)
##                  Name Beer_ID   ABV IBU Brewery_id
## 1            Pub Beer    1436 0.050  NA        409
## 2         Devil's Cup    2265 0.066  NA        178
## 3 Rise of the Phoenix    2264 0.071  NA        178
## 4            Sinister    2263 0.090  NA        178
## 5       Sex and Candy    2262 0.075  NA        178
## 6        Black Exodus    2261 0.077  NA        178
##                            Style Ounces
## 1            American Pale Lager     12
## 2        American Pale Ale (APA)     12
## 3                   American IPA     12
## 4 American Double / Imperial IPA     12
## 5                   American IPA     12
## 6                  Oatmeal Stout     12

#Number of rows in Beer data

nrow(Beerdata)
## [1] 2410
# This reads in the Beers data from select folder file Breweries.csv.
Breweriesdata <- read.csv("https://raw.githubusercontent.com/VenkataVanga/MSDS-6306/main/Breweries.csv",
                          header = T,sep = ",",na.strings = "NA", fill = TRUE)
head(Breweriesdata)
##   Brew_ID                      Name          City State
## 1       1        NorthGate Brewing    Minneapolis    MN
## 2       2 Against the Grain Brewery    Louisville    KY
## 3       3  Jack's Abby Craft Lagers    Framingham    MA
## 4       4 Mike Hess Brewing Company     San Diego    CA
## 5       5   Fort Point Beer Company San Francisco    CA
## 6       6     COAST Brewing Company    Charleston    SC

#Number of rows in Breweries data

nrow(Breweriesdata)
## [1] 558

Generating bar plots for Beerdata categorized by number of Ounces used to generate ABV

#Removing not applicable ABV values
Beerdata1 <- Beerdata %>% filter(Beerdata$ABV != "NA") 
Beerdata1$Ounces <- as.factor(Beerdata1$Ounces)

#Summary count of number of ABV values by categorized Ounces of beer
s <- Beerdata1 %>% group_by(Ounces)%>% summarize(ABV,count=n())
## `summarise()` regrouping output by 'Ounces' (override with `.groups` argument)
# Bar plot of ABV v. Ounces to see how much data is available
Beerdata1 %>% ggplot(aes(x = Ounces, y = s$count[1], fill=Ounces)) + geom_col() +
  ggtitle("Alcohol By Volume (ABV) data count by Ounce Category") +
  labs(y="Alcohol By Volume (ABV)") + 
  geom_text(aes(Ounces, s$count+100, label = s$count, fill = NULL), data = s)

#Generating geographic plots from breweries data (following chunk generate the final plot)

#makes a data frame with State name and abbreviation.
lookup = data.frame(abb = state.abb, State = state.name)
dc <- c("DC", "District of Columbia")
lookup <- rbind(lookup,dc)
# Change Column Name State to abb (abbreviation)
colnames(Breweriesdata)[4] = "abb" 

# Removes left space in state abb data taken from Breweries CSV
Breweriesdata$abb <- trimws(Breweriesdata$abb,which = c("left"))  

# make one dataset with state names and abb
Brewdata <- merge(Breweriesdata,lookup,"abb") 

Brewcount <- count(Brewdata,State,abb) #count up the occurrence of each state

colnames(Brewcount)[3] = "Breweries_Count" #change "n" to "Breweries_Count"

# added state name also changed to lower case
Brewcount$region <- tolower(Brewcount$State) 

Brewcount2 = Brewcount[-1] #removed first column from Brew count data state name 

states <- map_data("state") #contains data for states excluding Alaska, Hawaii

#Added Alaska to states as the states data does not include Alaska (Hawaii not in data so did not include in below data)
Alaska <- read.csv("https://raw.githubusercontent.com/VenkataVanga/MSDS-6306/main/Alaska.csv",header = T,sep = ",",na.strings = "NA",
                          fill = TRUE);
states <- rbind(states,Alaska)

#map.df is data frame containing longitude and latitude to form state and breweries count by state
map.df <- merge(states,Brewcount2, by="region", all.x=T)
map.df <- map.df[order(map.df$order),]

#Breweries count by state using gradient graphics
plot <- map.df %>% ggplot(aes(x=long, y=lat, group = group)) +
  geom_polygon(aes(fill = Breweries_Count)) +  geom_path() + 
  scale_fill_gradientn(colours=rev(topo.colors(10)),na.value="grey90",
                       breaks=c(0,5,10,15,20,25,30,35,40,45,50))+
  ggtitle("Breweries Count by State") + coord_map()

#state center longitude and latitude (read data from csv file)
st_center <- read.csv("https://raw.githubusercontent.com/VenkataVanga/MSDS-6306/main/long_lat_statecenter.csv",header = T,sep = ",",na.strings = "NA",
                          fill = TRUE);
# state names changed to lower case
st_center$region <- tolower(st_center$region) 

#map.df1 merges state center with Breweries count data
map.df1 <- merge(st_center,Brewcount2, by="region", all.x = T)

# Adding state names and Breweries count data by state 
Nplot <- plot + geom_point(data=map.df1, aes(x=long, y=lat, 
                                             size = Breweries_Count, group = region),
                           show.legend = FALSE) + 
  geom_text(aes( x= long, y = lat,label = abb.x, group = region), 
            data = map.df1, vjust=-1.2) +
  geom_text(aes(x = long, y = lat, label = Breweries_Count, group = region), 
            data = map.df1,vjust=1.8)


#Code below adds scaled legend to the Breweries count by state
cplot <- Nplot + theme(legend.background = element_rect(fill="gray90", 
                                                        size=10, linetype="dotted"),
                       legend.key.height = unit(4,"lines")) + 
  theme(plot.title = element_text(size=14, face= "bold", colour= "black"))

#Final plot with state names and Breweries count by state

cplot  #plot best viewed in scaled full screen zoom 1920px x 1080px (21" or higher dim screen)

The final plot shows that ‘Colorado’ has the highest (47)number of breweries followed by ‘California (39)’, ‘Michigan(32)’ and ‘Oregon(29)’ being the next three successors.

#Merging beer data and breweries data

colnames(Beerdata)[5] = "Brew_ID" #Changing columun name to combine both beer data and breweries data
Totaldata <- merge(Beerdata,Breweriesdata, by="Brew_ID", all.x = T)
colnames(Totaldata)[2] = "Beer_Name" #Changing the column name to Beer name
colnames(Totaldata)[8] = "Brewery_Name" # Changing the column name to Brewery name
colnames(Totaldata)[10] = "State" # Changing the column name back to State

#First 6 observations after merging beer data and breweries data

head(Totaldata,6)
##   Brew_ID     Beer_Name Beer_ID   ABV IBU                               Style
## 1       1  Get Together    2692 0.045  50                        American IPA
## 2       1 Maggie's Leap    2691 0.049  26                  Milk / Sweet Stout
## 3       1    Wall's End    2690 0.048  19                   English Brown Ale
## 4       1       Pumpion    2689 0.060  38                         Pumpkin Ale
## 5       1    Stronghold    2688 0.060  25                     American Porter
## 6       1   Parapet ESB    2687 0.056  47 Extra Special / Strong Bitter (ESB)
##   Ounces       Brewery_Name        City State
## 1     16 NorthGate Brewing  Minneapolis    MN
## 2     16 NorthGate Brewing  Minneapolis    MN
## 3     16 NorthGate Brewing  Minneapolis    MN
## 4     16 NorthGate Brewing  Minneapolis    MN
## 5     16 NorthGate Brewing  Minneapolis    MN
## 6     16 NorthGate Brewing  Minneapolis    MN

#Last 6 observations after merging beer data and breweries data

tail(Totaldata,6)
##      Brew_ID                 Beer_Name Beer_ID   ABV IBU
## 2405     556             Pilsner Ukiah      98 0.055  NA
## 2406     557  Heinnieweisse Weissebier      52 0.049  NA
## 2407     557           Snapperhead IPA      51 0.068  NA
## 2408     557         Moo Thunder Stout      50 0.049  NA
## 2409     557         Porkslap Pale Ale      49 0.043  NA
## 2410     558 Urban Wilderness Pale Ale      30 0.049  NA
##                        Style Ounces                  Brewery_Name          City
## 2405         German Pilsener     12         Ukiah Brewing Company         Ukiah
## 2406              Hefeweizen     12       Butternuts Beer and Ale Garrattsville
## 2407            American IPA     12       Butternuts Beer and Ale Garrattsville
## 2408      Milk / Sweet Stout     12       Butternuts Beer and Ale Garrattsville
## 2409 American Pale Ale (APA)     12       Butternuts Beer and Ale Garrattsville
## 2410        English Pale Ale     12 Sleeping Lady Brewing Company     Anchorage
##      State
## 2405    CA
## 2406    NY
## 2407    NY
## 2408    NY
## 2409    NY
## 2410    AK

#Missing Values in ABV/IBU columns

table(is.na(Totaldata$ABV))# True value represents NA values in the column ABV in original data
## 
## FALSE  TRUE 
##  2348    62
table(is.na(Totaldata$IBU))# True value represents NA values in the column IBU in original data
## 
## FALSE  TRUE 
##  1405  1005
gg_miss_var(Totaldata)+ ggtitle("Missing data values by Category")

Graph and the values in table above provide the missing data values in ABV (65) and IBU (1005) columns from originally considered beer data.

#Median for ABV and IBU content for each state (final plot given in below chunk)

#Evaluating median of ABV and IBU (all NA values removed from data)
Median_ABV_IBU <- Totaldata %>% arrange(State) %>% group_by(State) %>% 
  summarize(Median_ABV = median(ABV, na.rm = TRUE), Median_IBU = median(IBU,na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
Median_ABV_IBU <- data.frame(Median_ABV_IBU)

#Bar plot for median of ABV content for each state (This plot is added to IBU med plot below)
ABV_Medplot <- Median_ABV_IBU %>% ggplot() + 
  geom_bar(aes(x = State, y = Median_ABV, fill = State),group = Median_ABV_IBU$State, 
           width = 0.3, stat = 'identity', show.legend = FALSE)

#Bar plot for median of IBU content for each state (added to median of ABV plot)
ABV_IBU_Medplot <- ABV_Medplot + geom_bar(aes(x = as.numeric(factor(State)) + 0.4, 
                                              y = Median_IBU/1000),stat = 'identity',width = 0.3)
#Median IBU scaled down by 100 in order to set same y axis scale for ABV and IBU.
#Adding secondary y axis to the plot
ABV_IBU_Medplot1 <- ABV_IBU_Medplot + 
  scale_y_continuous(limits= c(0,0.072), labels = percent, name = "Alcohol By Volume (ABV) [%]", 
                      sec.axis = sec_axis(~.*1000,name = "International Bitterness Unit (IBU)")) +
  ggtitle('Median ABV and IBU Content by State') + theme(
# AXIS LABLES APPEARANCE
plot.title = element_text(size=14, face= "bold", colour= "black" ),
axis.title.x = element_text(size=12, face="bold", colour = "black"),    
axis.title.y = element_text(size=12, face="bold", colour = "black"),    
axis.text.x = element_text(size=12, face="bold", colour = "black"), 
axis.text.y = element_text(size=12, face="bold", colour = "black"),
strip.text.x = element_text(size = 10, face="bold", colour = "black" ),
strip.text.y = element_text(size = 10, face="bold", colour = "black"),
)

#Adding text to median values for ABV and IBU
ABV_IBU_Medplot2 <- ABV_IBU_Medplot1 + geom_text(aes(State, Median_ABV+0.0045, 
                                                     label = paste("ABV =",percent(Median_ABV, 
                                                                                   accuracy = 0.001)),
                                                     fill = NULL),
                                                 data = Median_ABV_IBU, angle = 90) +
  geom_text(aes(as.numeric(factor(State)) + 0.4, (Median_IBU/1000)+0.0032, 
                label = paste("IBU =",Median_IBU), fill = NULL), data = Median_ABV_IBU, angle = 90)

Median plot for ABV and IBU by state

ABV_IBU_Medplot2 
## Warning: Removed 1 rows containing missing values (position_stack).
## Warning: Removed 1 rows containing missing values (geom_text).

#plot best viewed in scaled full screen zoom 1920px x 1080px (21" or higher dim screen)

From the median ABV and median IBU plot above it can be observed that ‘District of Columbia’ and ‘Kentucky’ have the highest median ABV values shown in percentage. ‘Maine’ and ‘West Virginia’ have the highest median IBU values.

Maximum ABV and IBU by State

#collecting max data
Max_ABV <- Totaldata %>% arrange(State) %>% group_by(State) %>% 
  summarize(Max_ABV = max(ABV, na.rm=TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
Max_IBU <- Totaldata %>% arrange(State) %>% group_by(State) %>% 
  summarize(Max_IBU = max(IBU, na.rm=TRUE))
## Warning in max(IBU, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `summarise()` ungrouping output (override with `.groups` argument)
Max_ABV$Cat <- "Max_ABV"
Max_IBU$Cat <- "Max_IBU"

#Bar plot for max ABV content for each state
Maxplot_ABV_IBU <- Max_ABV %>% ggplot(mapping = aes(x,y)) + 
  geom_bar(aes(x = State, y = Max_ABV, fill = State), group = Max_ABV$State, 
                                                   stat = 'identity', show.legend = FALSE) +
  ggtitle('Max ABV Content by State') + geom_text(aes(State, Max_ABV+0.0085, 
                                                      label = percent(Max_ABV, accuracy = 0.01),
                                                      fill = NULL), data = Max_ABV, angle = 90) + 
  geom_bar(data=Max_IBU, aes(x = State, y = Max_IBU, fill = State), stat = 'identity', 
           show.legend = FALSE) +
  geom_text(aes(State, Max_IBU+5, label = Max_IBU, fill = NULL), data = Max_IBU) +
  ggtitle('Max ABV and Max IBU Content by State') + facet_grid(Cat~., scale = "free_y",
                                                              labeller = label_parsed, switch = "y") + labs(x="State") + 
  theme(axis.title.y = element_blank(), strip.placement = "outside")

Maximum ABV and IBU vaule by state

Maxplot_ABV_IBU 
## Warning: Removed 1 rows containing missing values (geom_bar).

#plot best viewed in scaled full screen zoom 1920px x 1080px (21" or higher dim screen)

From the Max ABV and IBU plot it is observed that ‘Colorado’ has the max ABV value of 12.8% and ‘Oregon’ has the max IBU value of 138.

ABV statistical attributes

Stats <- Totaldata %>% summarize(Mean = mean(ABV, na.rm=TRUE),
                        Median = median(ABV,na.rm=T), Max = max(ABV,na.rm=T), Min = min(ABV,na.rm=T),
                        SD = sd(ABV,na.rm=T), N = n())
#Histogram and Density Plot
His_Den <- Totaldata %>% filter(!is.na(ABV)) %>% ggplot(aes(x=ABV)) +
  geom_histogram(aes(y=..density..),colour='red',fill='blue', binwidth = 0.0035) +
  geom_density(alpha=.4, fill='#FFFF00') + 
  ggtitle('ABV Statistical Attributes - Histogram, Density and Box Plots') + 
  scale_x_continuous(expand = c(0,0), limit = c(0,0.14)) + labs(y="Density / Count")

#Box plot for ABV data
Box <- Totaldata %>% filter(!is.na(ABV)) %>% ggplot(aes(x=ABV)) + 
  geom_boxplot(col='black',fill='#FF6666') + scale_x_continuous(expand = c(0,0), limit = c(0,0.14)) + scale_y_continuous(expand = c(0,0), limit = c(-0.4,0.4))

#Histogram density plot and Box plot on same scale
grid.draw(rbind(ggplotGrob(His_Den),
                ggplotGrob(Box),
                size = "first"))
## Warning: Removed 2 rows containing missing values (geom_bar).

Stats
##         Mean Median   Max   Min         SD    N
## 1 0.05977342  0.056 0.128 0.001 0.01354173 2410

All statically significant attributes for the ABV values are shown above The distribution of ABV is slightly right skewed. ABV of beers around 5% has the highest count. There are total 2410 non-missing ABV values in this data set. The maximum ABV is 12.8%, the minimum ABV is .1%, the median ABV is 5.6%. The mean ABV is 5.98%, median 5.6% and standard deviation of ABV is 1.35%.

ABV v. IBU scatter plot

#Scatter plot with 
Totaldata %>% filter(!is.na(ABV) & !is.na(IBU)) %>% 
  ggplot(aes(y=ABV, x=IBU)) + geom_point(position='jitter') + geom_smooth(method=loess)
## `geom_smooth()` using formula 'y ~ x'